{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# MapReduce"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "## Intuition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "a = [[1,2,1], [3,2], [4,9,1,0,2]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "sums = map(sum, a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "sums = [] \n",
    "for sublist in a: \n",
    "    results = sum(sublist) \n",
    "    sums.append(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def add(a, b):\n",
    "    return a + b "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25\n"
     ]
    }
   ],
   "source": [
    "from functools import reduce\n",
    "print(reduce(add, sums, 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "initial = 0\n",
    "current_result = initial\n",
    "for element in sums:\n",
    "    current_result = add(current_result, element)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "## Basic Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "def map_word_count(document_id, document):\n",
    "    counts = defaultdict(int)\n",
    "    for word in document.split():\n",
    "        counts[word] += 1\n",
    "    for word in counts:\n",
    "        yield (word, counts[word])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def shuffle_words(results_generators):\n",
    "    records = defaultdict(list)\n",
    "    for results in results_generators:\n",
    "        for word, count in results:\n",
    "            records[word].append(count)\n",
    "    for word in records:\n",
    "        yield (word, records[word])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def reduce_counts(word, list_of_counts):\n",
    "    return (word, sum(list_of_counts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups\n",
    "dataset = fetch_20newsgroups(subset='train')\n",
    "documents = dataset.data[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "map_results = map(map_word_count, range(len(documents)), documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "shuffle_results = shuffle_words(map_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "reduce_results = [reduce_counts(word, list_of_counts) for word, list_of_counts in shuffle_results]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('coming', 1), (\"couldn't\", 4), ('Jose,', 1), ('{As', 1), ('185c', 1)]\n",
      "5036\n"
     ]
    }
   ],
   "source": [
    "print(reduce_results[:5])\n",
    "print(len(reduce_results))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from joblib import Parallel, delayed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def map_word_count(document_id, document):\n",
    "    counts = defaultdict(int)\n",
    "    for word in document.split():\n",
    "        counts[word] += 1\n",
    "    return list(counts.items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "map_results = Parallel(n_jobs=2)(delayed(map_word_count)(i, document)\n",
    "                                 for i, document in enumerate(documents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "shuffle_results = shuffle_words(map_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('coming', [1]),\n",
       " (\"couldn't\", [1, 1, 1, 1]),\n",
       " ('Jose,', [1]),\n",
       " ('{As', [1]),\n",
       " ('185c', [1]),\n",
       " ('burst', [5]),\n",
       " ('context.', [1]),\n",
       " ('copy,', [1]),\n",
       " ('**********************************************************************',\n",
       "  [1]),\n",
       " ('Modular', [1]),\n",
       " ('Yeah,', [1]),\n",
       " ('parking', [1]),\n",
       " ('Prices!', [1]),\n",
       " ('em', [1]),\n",
       " ('record,', [1]),\n",
       " ('program', [1]),\n",
       " ('>philosophically<', [1]),\n",
       " ('kind', [1, 1]),\n",
       " ('opinions', [2, 1, 1]),\n",
       " ('cubic', [1]),\n",
       " ('vision', [1]),\n",
       " ('later', [1, 1, 1]),\n",
       " ('$3495,', [1]),\n",
       " ('she', [2, 1]),\n",
       " ('xray@is.rice.edu', [1]),\n",
       " ('up', [2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 3]),\n",
       " ('Callison', [1]),\n",
       " ('v8', [1]),\n",
       " ('No', [6, 1]),\n",
       " ('disobeys', [1]),\n",
       " ('term?', [1]),\n",
       " ('login', [1]),\n",
       " ('Most', [1, 1, 1, 3, 1]),\n",
       " ('kept', [1]),\n",
       " ('(Repost)', [1]),\n",
       " ('mean', [1, 1, 1]),\n",
       " ('luck,', [1]),\n",
       " ('punisher.caltech.edu', [1]),\n",
       " ('nCUBE', [1]),\n",
       " ('result', [1]),\n",
       " ('Problems???', [1]),\n",
       " ('(I', [2, 1]),\n",
       " ('Grow', [1]),\n",
       " ('Goalie', [1]),\n",
       " ('Binoculars', [1]),\n",
       " ('boots),', [1]),\n",
       " ('multiple', [3]),\n",
       " ('At', [1, 1, 1]),\n",
       " ('Nearby', [1]),\n",
       " (\"won't-\", [1]),\n",
       " ('however', [1]),\n",
       " ('one', [1, 3, 2, 8, 1, 1, 1, 2, 5, 1, 2, 3, 1, 1]),\n",
       " ('Vijay', [2]),\n",
       " ('great.', [1, 1]),\n",
       " ('stuff', [1, 1, 1]),\n",
       " ('problem.', [1]),\n",
       " ('movies', [4]),\n",
       " ('associated', [1]),\n",
       " ('continues', [1]),\n",
       " ('Call', [1, 1]),\n",
       " ('(David', [2, 1]),\n",
       " ('hand-cocked', [1]),\n",
       " ('Brewers', [1]),\n",
       " ('btw.', [1]),\n",
       " ('game,', [1]),\n",
       " (\">(there's\", [1]),\n",
       " ('boy,', [1]),\n",
       " ('safest', [1]),\n",
       " ('add', [2, 1, 1, 2, 1]),\n",
       " ('mos.', [2]),\n",
       " ('references', [1, 1]),\n",
       " ('Negev', [1]),\n",
       " ('nuclear', [1, 5]),\n",
       " ('stack@translab.its.uci.edu', [1]),\n",
       " ('thought.', [1]),\n",
       " ('this;', [1]),\n",
       " ('racers,', [1]),\n",
       " ('things\"', [2]),\n",
       " ('said,', [1, 1, 1]),\n",
       " ('it,', [1, 1, 1]),\n",
       " ('best?', [1]),\n",
       " (\"How's\", [1]),\n",
       " ('Silex', [1]),\n",
       " ('0-5MB/s', [1]),\n",
       " ('necessary)', [1]),\n",
       " ('\"If', [2]),\n",
       " ('semi-autos', [3]),\n",
       " ('destruction', [3]),\n",
       " ('saying', [1, 1]),\n",
       " ('it:', [1]),\n",
       " ('29', [1]),\n",
       " ('dealers', [1]),\n",
       " ('agrees', [1]),\n",
       " ('low', [1]),\n",
       " ('round', [1, 1, 1]),\n",
       " ('fulfilled', [1]),\n",
       " ('Clause;', [1]),\n",
       " ('Diskdoubler,', [1]),\n",
       " ('$60', [1]),\n",
       " ('exotic', [1]),\n",
       " ('Such', [1]),\n",
       " ('conditions,', [1]),\n",
       " ('tellme', [1]),\n",
       " ('up??', [1]),\n",
       " ('abarden@afseo.eglin.af.mil', [1]),\n",
       " ('wanted', [1]),\n",
       " ('Does', [1, 1]),\n",
       " ('annul', [1]),\n",
       " ('mouth', [1, 1]),\n",
       " ('appreciate', [1]),\n",
       " ('gave', [1, 1, 1]),\n",
       " ('Krueger)', [1]),\n",
       " ('less.', [2]),\n",
       " ('Statistics', [1]),\n",
       " ('anything', [1, 1, 2, 1, 1]),\n",
       " ('space', [2, 1]),\n",
       " ('Launch', [1, 1]),\n",
       " ('station', [1]),\n",
       " ('COrrado', [1]),\n",
       " ('troubled', [1]),\n",
       " ('establishes', [1]),\n",
       " ('MY', [1]),\n",
       " ('51.6', [1]),\n",
       " ('8.3', [1]),\n",
       " ('pharmacists', [1]),\n",
       " ('workstation.', [1]),\n",
       " ('160', [2]),\n",
       " ('Hewlett', [2]),\n",
       " ('Gun', [2]),\n",
       " ('lawyers', [1]),\n",
       " ('10', [1, 2, 1, 1]),\n",
       " ('Dwayne', [1]),\n",
       " ('Zoom', [1]),\n",
       " ('font', [4]),\n",
       " ('(Theodore', [1]),\n",
       " ('keith', [1]),\n",
       " ('(as', [1, 1, 1, 1]),\n",
       " ('IBM', [3, 1]),\n",
       " (\"Colt's\", [1]),\n",
       " ('corner', [1]),\n",
       " ('Atheists?', [1]),\n",
       " ('LX', [1]),\n",
       " ('MSFC,', [1]),\n",
       " ('>Excerpts', [1]),\n",
       " ('town', [2]),\n",
       " ('F-150', [1]),\n",
       " ('signing.', [2]),\n",
       " ('series', [1]),\n",
       " ('stalled', [1]),\n",
       " ('Lustig', [1]),\n",
       " ('bogus.', [1]),\n",
       " ('holes', [1, 1]),\n",
       " ('after', [1, 1, 4, 1, 3, 1]),\n",
       " ('wingless', [1]),\n",
       " ('checked', [1]),\n",
       " ('excellent', [1]),\n",
       " ('because', [1, 1, 1, 2, 2, 1, 2, 4, 1, 1, 1, 3]),\n",
       " ('(the', [1, 1, 1, 1, 1]),\n",
       " ('Bonilla,', [1]),\n",
       " ('sez;', [1]),\n",
       " ('WHAT', [1]),\n",
       " ('deterring', [1]),\n",
       " ('Germany,', [2]),\n",
       " ('old', [1, 1, 2]),\n",
       " ('do', [1, 2, 1, 1, 1, 6, 1, 1, 1, 2, 1, 1, 2]),\n",
       " ('applicable', [1]),\n",
       " ('1300', [1]),\n",
       " ('important?', [1]),\n",
       " ('least,', [1]),\n",
       " ('initially),', [1]),\n",
       " (\"doens't\", [1]),\n",
       " ('tents,', [1]),\n",
       " ('adam@endor.uucp', [1]),\n",
       " ('two', [1, 1, 1, 1, 1, 1, 1, 1, 1]),\n",
       " ('lists', [1]),\n",
       " ('(which', [1, 1]),\n",
       " ('T,', [2]),\n",
       " ('Oklahoma;', [1]),\n",
       " ('five', [1]),\n",
       " ('sound', [1]),\n",
       " ('optimize', [1]),\n",
       " ('rated', [1, 1]),\n",
       " ('computed', [1]),\n",
       " ('post', [1, 1, 1, 1, 1, 1]),\n",
       " ('immediately', [1, 1]),\n",
       " ('In', [1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1]),\n",
       " ('recieved', [1]),\n",
       " ('Maryland,', [1]),\n",
       " ('disregardful', [1]),\n",
       " ('him)', [1]),\n",
       " ('somewhere', [1]),\n",
       " ('Data', [1]),\n",
       " ('SSF', [9]),\n",
       " ('Soviet', [1]),\n",
       " ('Car:', [1]),\n",
       " ('air;', [1]),\n",
       " ('sky', [1]),\n",
       " ('tombaker@world.std.com', [1]),\n",
       " ('posting', [1]),\n",
       " ('were', [1, 1, 1, 3, 3, 3, 1, 5, 1]),\n",
       " ('configuration', [2]),\n",
       " ('>--', [1, 1, 1]),\n",
       " ('reduced', [1]),\n",
       " ('Mountain', [1]),\n",
       " ('Vanbiesbrouck.', [1]),\n",
       " ('purified', [1]),\n",
       " ('doubt', [1, 1, 1, 1]),\n",
       " ('Leafs,', [1]),\n",
       " ('Funny', [1]),\n",
       " ('DD', [1]),\n",
       " ('{16-bit/wide', [1]),\n",
       " ('Kuo)', [1]),\n",
       " ('son', [3]),\n",
       " ('Sunday', [1]),\n",
       " ('means).', [1]),\n",
       " ('.481,', [1]),\n",
       " (\"you're\", [1, 1, 1, 1, 1, 1, 1]),\n",
       " ('>Lawrence', [1]),\n",
       " ('freeware', [1]),\n",
       " ('Loney', [1]),\n",
       " ('bad', [1]),\n",
       " ('links', [1]),\n",
       " ('Bonilla', [2]),\n",
       " ('beachball!\"', [1]),\n",
       " ('scares', [1]),\n",
       " ('care.', [1]),\n",
       " ('hot', [3]),\n",
       " ('[ssa@unity.ncsu.edu]', [1]),\n",
       " ('integer.', [1]),\n",
       " ('use.', [1]),\n",
       " ('Gosh..I', [1]),\n",
       " ('mode}:', [2]),\n",
       " ('motto', [1]),\n",
       " ('(Lemieux)', [1]),\n",
       " ('HoloNet', [1]),\n",
       " ('much', [1, 1, 1, 1, 3, 1, 1, 1, 1, 1]),\n",
       " ('>Folks,', [1]),\n",
       " ('philosophical', [1]),\n",
       " ('cases', [1, 1, 1]),\n",
       " ('who/what', [1]),\n",
       " ('controller', [4]),\n",
       " ('Insurance', [5]),\n",
       " ('>then', [1]),\n",
       " ('None', [1]),\n",
       " ('shaky', [1]),\n",
       " ('(Operator)', [1]),\n",
       " ('1:', [1]),\n",
       " ('re-claimed', [1]),\n",
       " ('keith@cco.caltech.edu', [1]),\n",
       " ('trial\",', [1]),\n",
       " ('calling', [1]),\n",
       " ('solicit', [1]),\n",
       " ('works', [1, 1, 1]),\n",
       " ('Division', [2, 1]),\n",
       " ('--salty', [1]),\n",
       " ('--', [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),\n",
       " ('cars.', [2]),\n",
       " ('Nazis),', [1]),\n",
       " ('leads', [1]),\n",
       " ('FTP', [1]),\n",
       " ('Foligno', [1]),\n",
       " ('solar', [2]),\n",
       " (\"You'll\", [1]),\n",
       " ('CIRCUIT', [1]),\n",
       " ('up,', [1]),\n",
       " ('yesterday.', [2]),\n",
       " ('924,', [3]),\n",
       " ('same,', [1]),\n",
       " ('launch,', [1]),\n",
       " ('################################################################################',\n",
       "  [2]),\n",
       " ('received.', [1, 1]),\n",
       " ('jcm@head-cfa.harvard.edu', [1]),\n",
       " ('.NOT.', [1]),\n",
       " ('centerline', [1]),\n",
       " ('$80', [1]),\n",
       " ('Etzion,', [1]),\n",
       " ('increased', [1]),\n",
       " ('12-2', [1]),\n",
       " ('airbag,', [1]),\n",
       " ('empty', [1]),\n",
       " ('trial', [1]),\n",
       " ('children).', [1]),\n",
       " ('decided', [2, 1]),\n",
       " ('\"nigger\"', [1]),\n",
       " ('well.', [1]),\n",
       " ('No.', [1, 1]),\n",
       " ('strictly', [1]),\n",
       " ('technology.', [1]),\n",
       " ('Y', [2]),\n",
       " ('smith', [1]),\n",
       " ('helps.', [3, 1]),\n",
       " ('thing)', [1]),\n",
       " ('1000', [1, 1]),\n",
       " ('Army', [1]),\n",
       " ('respect.', [1]),\n",
       " (\"insurance's\", [1]),\n",
       " ('spec,', [1]),\n",
       " ('>fossil', [2]),\n",
       " ('56', [1]),\n",
       " ('stop', [2, 1]),\n",
       " ('Airplane', [1]),\n",
       " ('into', [1, 1, 1, 1, 1, 1, 1, 1, 1]),\n",
       " ('fancy', [1]),\n",
       " ('support)', [1]),\n",
       " ('(docking', [1]),\n",
       " ('citizens', [1, 1]),\n",
       " ('\"inappropriate\"', [1]),\n",
       " ('VAX/VMS', [1]),\n",
       " ('55;', [1]),\n",
       " ('debris)', [1]),\n",
       " ('code', [1, 1]),\n",
       " ('murmurs', [1]),\n",
       " ('own.', [1, 1]),\n",
       " ('>every<', [1]),\n",
       " ('finish', [1]),\n",
       " ('\"subrogation.\"', [1]),\n",
       " (\"haven't\", [1, 1, 1, 1]),\n",
       " ('hurts', [1]),\n",
       " ('most', [1, 1, 2, 1, 2, 1, 1]),\n",
       " ('unnecessarily', [1]),\n",
       " ('1948:', [1]),\n",
       " ('43', [1]),\n",
       " ('Focus', [1]),\n",
       " ('played', [1]),\n",
       " ('Syphers)', [1]),\n",
       " ('[version', [1, 1, 1]),\n",
       " ('specific', [1, 1]),\n",
       " ('death.', [1]),\n",
       " ('on.', [1, 1, 1]),\n",
       " ('cryptology;', [1]),\n",
       " ('R_Tim_Coslet@cup.portal.', [1]),\n",
       " ('(Portable', [1]),\n",
       " ('bgrubb@dante.nmsu.edu', [2]),\n",
       " ('brings', [1]),\n",
       " ('Nagle', [1]),\n",
       " ('way.', [1, 1, 1]),\n",
       " ('YEARS.', [1]),\n",
       " ('there?', [1]),\n",
       " ('fault,', [1]),\n",
       " ('Lockheed', [2]),\n",
       " ('SO', [1]),\n",
       " ('Lloyd', [1]),\n",
       " ('Fox)', [1]),\n",
       " ('intention', [1]),\n",
       " ('expressing', [1]),\n",
       " ('abarden@tybse1.uucp', [1]),\n",
       " ('magnification', [1]),\n",
       " ('managers', [1]),\n",
       " ('preferably', [1]),\n",
       " ('crook,', [1]),\n",
       " ('knowledge', [1]),\n",
       " ('Stac', [2]),\n",
       " ('too)', [1]),\n",
       " ('via', [1]),\n",
       " ('Redesign', [1]),\n",
       " ('policy', [3]),\n",
       " ('<Apr.9.08.39.25.1993.15639@romulus.rutgers.edu>', [1]),\n",
       " ('missions', [1]),\n",
       " ('\"little', [2]),\n",
       " ('story', [1]),\n",
       " ('practice.', [1]),\n",
       " ('gripe', [1]),\n",
       " ('important.', [1]),\n",
       " ('why', [1, 1, 1, 1, 1, 2]),\n",
       " ('EVA', [1]),\n",
       " ('Jonathan_Hayward@wheaton.edu', [1]),\n",
       " ('law', [1]),\n",
       " ('Division,', [1]),\n",
       " ('spacify', [1]),\n",
       " ('SS10', [1]),\n",
       " ('younger,', [1]),\n",
       " ('bugs,', [1]),\n",
       " ('frost', [1]),\n",
       " ('turn', [1]),\n",
       " ('shooting', [1]),\n",
       " ('subjectiveness.', [1]),\n",
       " ('Yassin', [1]),\n",
       " ('grossly', [1]),\n",
       " ('Israel', [2]),\n",
       " ('(Hope', [1]),\n",
       " ('corner...', [1]),\n",
       " ('foot', [1]),\n",
       " ('theft', [2]),\n",
       " ('Allstate.', [1]),\n",
       " ('Disclaimer:', [1]),\n",
       " ('P.S.', [1]),\n",
       " ('big', [1, 1, 1]),\n",
       " ('News-Software:', [1]),\n",
       " ('are',\n",
       "  [1, 2, 2, 1, 2, 4, 8, 1, 2, 4, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 8, 2, 3]),\n",
       " ('jammed!\"', [1]),\n",
       " ('society.', [1]),\n",
       " ('course),', [1]),\n",
       " ('concrete', [1]),\n",
       " ('BeHanna)', [1]),\n",
       " ('1200', [1]),\n",
       " ('distinguish', [1]),\n",
       " ('THERMOCOUPLE', [1]),\n",
       " ('cage).', [1]),\n",
       " ('voice', [2, 1]),\n",
       " ('no.', [1]),\n",
       " ('autumn', [1]),\n",
       " ('(brian', [1]),\n",
       " ('undoubtably', [1]),\n",
       " ('Ken,', [1]),\n",
       " ('>>>', [3]),\n",
       " (\"weren't\", [1]),\n",
       " ('carson.u.washington.edu', [1]),\n",
       " ('FASTER', [1]),\n",
       " ('off', [1, 1, 4, 1, 2, 1]),\n",
       " ('inquiry', [1]),\n",
       " ('Article-I.D.:', [1, 1]),\n",
       " ('difference', [1]),\n",
       " ('ambient.', [1]),\n",
       " ('head', [1]),\n",
       " ('<1r466c$an3@news.intercon.com>', [1]),\n",
       " ('do.', [1]),\n",
       " ('Marks)', [1]),\n",
       " ('pay', [4, 2]),\n",
       " ('controler', [1]),\n",
       " ('Flyers.', [1]),\n",
       " ('our', [1]),\n",
       " ('Plesac', [1]),\n",
       " ('turning,', [1]),\n",
       " ('boundaries', [1]),\n",
       " ('>removing', [1]),\n",
       " ('existing', [3]),\n",
       " ('malnutrition', [1]),\n",
       " ('8-bit', [1]),\n",
       " ('require', [1]),\n",
       " ('>contamination.', [1]),\n",
       " ('play', [1, 1]),\n",
       " ('Shea.', [1]),\n",
       " ('{With', [1]),\n",
       " ('Card)', [1]),\n",
       " ('lefties', [2]),\n",
       " (\"'stay'\", [1]),\n",
       " ('mask', [3]),\n",
       " ('Cerkoney)', [1]),\n",
       " ('flow', [1]),\n",
       " ('unless', [1, 1, 1, 1]),\n",
       " ('Kyanko', [1]),\n",
       " ('>capacity', [1]),\n",
       " ('Station.', [1]),\n",
       " ('\"Behind', [1]),\n",
       " ('econoboxes', [1]),\n",
       " ('fighting', [3]),\n",
       " ('Wisconsin', [1]),\n",
       " ('successful.', [1]),\n",
       " ('bill.', [1]),\n",
       " ('entry', [2]),\n",
       " ('torn', [1]),\n",
       " ('Great!', [1]),\n",
       " ('stands', [1]),\n",
       " ('said', [1, 1, 1]),\n",
       " ('Yale', [1]),\n",
       " ('new', [1, 1, 1, 1, 1]),\n",
       " ('Studies,', [1]),\n",
       " ('4', [2, 1]),\n",
       " ('Rather', [1]),\n",
       " ('generation,', [1]),\n",
       " ('Tape.Tape', [1]),\n",
       " ('faster', [2, 2, 1]),\n",
       " ('Treatment', [1]),\n",
       " ('kmr4@po.CWRU.edu', [1]),\n",
       " (\">That's\", [1, 1]),\n",
       " ('$1,000,000', [1]),\n",
       " ('yrs', [1]),\n",
       " ('rotate....', [1]),\n",
       " ('Subscriber', [1]),\n",
       " ('boilers.', [1]),\n",
       " ('HELP', [1]),\n",
       " ('@', [1, 1]),\n",
       " ('joplin.biosci.arizona.edu', [1]),\n",
       " ('left', [1, 1, 1, 2]),\n",
       " ('---', [1, 1]),\n",
       " ('top', [1, 1]),\n",
       " ('chalk', [1]),\n",
       " ('TIFF,', [1]),\n",
       " ('roughly', [1]),\n",
       " ('18', [1]),\n",
       " ('dubing,', [1]),\n",
       " ('who', [1, 2, 1, 1, 1, 1, 2, 2, 1, 5, 1, 1, 2, 1]),\n",
       " ('bronze/brown/orange', [1]),\n",
       " ('$2,500.', [1]),\n",
       " ('MOVIES', [1]),\n",
       " ('Even', [1, 1, 1]),\n",
       " ('$500', [3]),\n",
       " ('how', [2, 1, 1, 1, 1, 1]),\n",
       " ('versus', [1]),\n",
       " ('developed', [2]),\n",
       " ('(full-cover,', [1]),\n",
       " ('out.', [1]),\n",
       " ('Presence', [2]),\n",
       " ('put', [1, 1, 1, 1]),\n",
       " ('questions', [1]),\n",
       " ('three-on-three', [1]),\n",
       " ('environments', [1]),\n",
       " ('[via', [1]),\n",
       " ('Naval', [1]),\n",
       " ('Vincint', [1]),\n",
       " ('shelley.1qvfo9INNc3s', [1]),\n",
       " ('6th', [1]),\n",
       " ('tended', [1]),\n",
       " ('Zealand', [1]),\n",
       " ('smithsonian', [1]),\n",
       " ('here', [1, 1]),\n",
       " ('>deductible,', [1]),\n",
       " ('mind', [1, 1]),\n",
       " (')>>', [7]),\n",
       " ('job', [1, 1]),\n",
       " ('Mudd', [1]),\n",
       " ('NYT', [1]),\n",
       " ('utilize', [1]),\n",
       " ('unlikely', [1]),\n",
       " ('stpl.ists.ca', [1]),\n",
       " ('now),', [1]),\n",
       " ('nose.', [1]),\n",
       " ('Sabres', [1]),\n",
       " ('protest', [1]),\n",
       " ('this?', [1]),\n",
       " ('reach', [1]),\n",
       " ('xandor@unixg.ubc.ca', [1]),\n",
       " ('money', [1, 1, 1, 1]),\n",
       " ('\"B\"', [2]),\n",
       " ('raised', [1]),\n",
       " ('Piaget)', [1]),\n",
       " ('Player,', [1]),\n",
       " (\"there's\", [1]),\n",
       " ('AT&T', [1]),\n",
       " ('Albert', [1]),\n",
       " ('rifles.', [1]),\n",
       " ('chin', [1]),\n",
       " ('Jaha', [1]),\n",
       " (\"I've\", [3, 1, 1, 3, 1, 1, 4]),\n",
       " ('standardized,', [1]),\n",
       " ('Email:', [1, 1]),\n",
       " ('Cipale)', [1]),\n",
       " ('message', [1, 1]),\n",
       " ('guest)', [1]),\n",
       " ('woke', [1]),\n",
       " ('memory.', [1]),\n",
       " ('$24', [1]),\n",
       " ('(Eli', [1]),\n",
       " ('child', [6]),\n",
       " ('under/into.', [1]),\n",
       " ('humor.\"', [1]),\n",
       " ('enough).', [1]),\n",
       " ('Kerr)', [1]),\n",
       " ('(Amanda', [1]),\n",
       " ('Illinois/Urbana', [1]),\n",
       " ('appearance.', [1]),\n",
       " ('stable', [1]),\n",
       " ('Nietzsche', [1]),\n",
       " ('World', [1]),\n",
       " ('attacking.', [1]),\n",
       " ('tapped.', [1]),\n",
       " ('enough,', [1, 1]),\n",
       " ('berthing', [1]),\n",
       " ('chamber', [3]),\n",
       " ('scuffling?', [1]),\n",
       " ('Appendix', [1]),\n",
       " ('captains', [1]),\n",
       " ('>originally', [1]),\n",
       " ('missing', [1, 1]),\n",
       " ('is,', [1]),\n",
       " ('ST', [1]),\n",
       " ('heard', [3, 1, 1]),\n",
       " ('thought,', [1]),\n",
       " ('1900', [1]),\n",
       " ('Tucson', [1]),\n",
       " ('Corp.,', [1]),\n",
       " ('<26', [1]),\n",
       " ('flags', [1]),\n",
       " ('interested,', [1]),\n",
       " ('downright', [1]),\n",
       " ('sucked.', [1]),\n",
       " ('semi-autos.', [1]),\n",
       " ('unknowable,', [1]),\n",
       " ('$520/6', [1]),\n",
       " ('advantage).', [1]),\n",
       " ('around', [2, 1, 1, 1, 1]),\n",
       " ('on-board.', [1]),\n",
       " ('info:', [1]),\n",
       " ('PA', [2, 1]),\n",
       " ('wierd', [1]),\n",
       " ('ssa@unity.ncsu.edu', [1]),\n",
       " (\"(he'd\", [1]),\n",
       " ('1970', [2]),\n",
       " ('high', [1, 2, 2]),\n",
       " ('98%', [1]),\n",
       " ('<1993Apr20.151818.4319@samba.oit.unc.edu>', [1]),\n",
       " ('someone', [1, 1]),\n",
       " ('better).', [1]),\n",
       " (\"wanna-be's),\", [1]),\n",
       " ('other', [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1]),\n",
       " ('final', [1]),\n",
       " ('modular', [1]),\n",
       " ('modern', [2, 1]),\n",
       " ('cradle', [1]),\n",
       " ('distance!).', [1]),\n",
       " ('>swashbuckling', [1]),\n",
       " ('11', [1, 1, 1]),\n",
       " ('equal', [1]),\n",
       " ('stolen', [1]),\n",
       " ('image,', [1, 2]),\n",
       " ('group', [1]),\n",
       " ('occur', [1]),\n",
       " ('>SCSI-I', [1]),\n",
       " ('propulsion', [1]),\n",
       " ('Runs', [1]),\n",
       " ('salesman', [1]),\n",
       " ('simply', [1]),\n",
       " ('doubts', [1]),\n",
       " ('battle', [1]),\n",
       " ('Front', [1]),\n",
       " ('failures', [1]),\n",
       " ('aside,', [1, 1]),\n",
       " ('thing\"', [1]),\n",
       " ('shuttle', [2]),\n",
       " ('granite', [1]),\n",
       " ('admin', [1]),\n",
       " ('time.', [1, 1]),\n",
       " ('attack', [1, 1]),\n",
       " ('service', [1, 1]),\n",
       " ('glad', [1]),\n",
       " ('learns', [1]),\n",
       " ('accidentally', [1]),\n",
       " ('336-9591', [1]),\n",
       " ('summer\"', [1]),\n",
       " ('LX.)', [1]),\n",
       " ('intended', [1, 1]),\n",
       " ('came', [1, 1]),\n",
       " ('dirtbike', [1]),\n",
       " ('first', [2, 1, 3]),\n",
       " ('there)}', [1]),\n",
       " ('own', [1, 1, 1, 1, 1, 1]),\n",
       " ('Sundheim)', [1]),\n",
       " ('And,', [1]),\n",
       " ('monitor.', [1]),\n",
       " ('middle', [1, 2]),\n",
       " ('jap', [1]),\n",
       " ('Remote', [1]),\n",
       " ('fault', [1]),\n",
       " ('summary', [1, 2, 1, 1]),\n",
       " ('helpful', [1]),\n",
       " ('man', [1, 1]),\n",
       " ('rarely', [1]),\n",
       " ('floppy..BURN', [1]),\n",
       " (\"ERA's\", [1]),\n",
       " ('shell', [1]),\n",
       " ('>Oddly,', [1]),\n",
       " ('>machines\".', [1]),\n",
       " ('(Abraham', [1]),\n",
       " ('do\"', [1]),\n",
       " ('key', [1, 1]),\n",
       " ('range', [1]),\n",
       " ('also),', [1]),\n",
       " ('with:', [1]),\n",
       " ('prepared', [1]),\n",
       " ('(919)467-7909', [1]),\n",
       " ('expelled?', [1]),\n",
       " ('cool', [1]),\n",
       " ('towing,', [1]),\n",
       " ('signal', [1]),\n",
       " ('liked.', [1]),\n",
       " ('knowledge.', [1]),\n",
       " ('user', [1]),\n",
       " ('game.', [1]),\n",
       " ('improves', [1]),\n",
       " ('necessarily', [1]),\n",
       " ('Then', [1]),\n",
       " ('happy', [1]),\n",
       " ('use', [1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1]),\n",
       " ('L', [1]),\n",
       " ('2:', [1]),\n",
       " ('too', [2]),\n",
       " ('rac3.wam.umd.edu', [1]),\n",
       " ('$80/year', [1]),\n",
       " ('tech', [1]),\n",
       " ('time', [1, 1, 1, 1, 2, 1]),\n",
       " ('bangkok', [1]),\n",
       " ('3)Monolux', [1]),\n",
       " ('ADC', [1]),\n",
       " ('known', [2, 2]),\n",
       " ('am\"', [1]),\n",
       " ('lesson', [1]),\n",
       " ('ranks', [1]),\n",
       " ('129.89.7.4', [1]),\n",
       " ('>If', [1]),\n",
       " ('However,', [1, 1, 1, 1, 1, 2]),\n",
       " ('whom', [1]),\n",
       " ('before.', [1]),\n",
       " ('lacked', [1]),\n",
       " ('image\"', [1]),\n",
       " ('box', [1]),\n",
       " ('weapons', [7]),\n",
       " ('tolerance', [1]),\n",
       " ('effectively', [1]),\n",
       " (\">'Cooling\", [1]),\n",
       " ('constitute', [1]),\n",
       " ('want', [1, 1, 1, 2, 1, 1, 1, 2, 1]),\n",
       " ('relationship', [1]),\n",
       " ('fixable.', [1]),\n",
       " ('(this', [1]),\n",
       " ('on)', [1]),\n",
       " ('considers', [1]),\n",
       " ('back', [1, 1, 1, 1, 1]),\n",
       " ('worn', [1]),\n",
       " ('year', [1, 6, 1]),\n",
       " ('accidents...', [1]),\n",
       " ('(Foxvog', [2]),\n",
       " ('shop', [1]),\n",
       " ('$3K.', [1]),\n",
       " ('Treat', [1]),\n",
       " ('being', [2, 1, 2, 1, 1, 2, 1]),\n",
       " ('defensive', [2]),\n",
       " ('>BZZZT!', [1]),\n",
       " ('future', [2, 1]),\n",
       " ('YOURS.', [1]),\n",
       " ('driving', [5]),\n",
       " ('dogs.', [1]),\n",
       " ('mode', [1, 2]),\n",
       " ('Voic', [1]),\n",
       " ('dirt', [1]),\n",
       " (\"let's\", [1]),\n",
       " ('phone,', [1]),\n",
       " ('competition.)', [1]),\n",
       " ('serious.', [1]),\n",
       " ('Original-Sender:', [1]),\n",
       " ('>>long', [1]),\n",
       " ('(James', [1, 1]),\n",
       " ('starting', [1, 2]),\n",
       " ('turbo', [1]),\n",
       " ('exception', [1]),\n",
       " ('restraint', [1]),\n",
       " ('foxvog', [1]),\n",
       " ('Man-Tended', [1]),\n",
       " ('(Assured', [1]),\n",
       " ('40MB/s', [1]),\n",
       " ('Sci,', [1]),\n",
       " ('humanity.', [1]),\n",
       " ('her,', [1]),\n",
       " ('Eli', [1]),\n",
       " ('21', [2, 1, 1]),\n",
       " ('pack', [2, 1]),\n",
       " ('subtly', [1]),\n",
       " ('Vaive', [1]),\n",
       " ('USL', [1]),\n",
       " ('swear,', [1]),\n",
       " ('qualified', [1]),\n",
       " ('nothing', [1]),\n",
       " ('mass', [7]),\n",
       " ('chip}', [1]),\n",
       " ('saw,', [1]),\n",
       " ('info', [1, 1, 1, 1, 1, 1]),\n",
       " ('krueger@helium.gas.uug.arizona.edu', [1]),\n",
       " ('Management.', [1]),\n",
       " ('P9000', [2]),\n",
       " ('Communications,', [1]),\n",
       " ('citizenship.', [1]),\n",
       " ('straight,', [1]),\n",
       " ('Well', [1]),\n",
       " ('Attacks', [1]),\n",
       " ('Hebron,', [1]),\n",
       " ('Still,', [1]),\n",
       " ('\"those', [1]),\n",
       " ('WFAN', [1]),\n",
       " ('$2000', [2]),\n",
       " (\"Vanbiesbrouck's\", [1]),\n",
       " ('propulsion,', [1]),\n",
       " ('(harleys,', [1]),\n",
       " ('factor', [1, 1]),\n",
       " ('jaskew@spam.maths.adelaide.edu.au', [1]),\n",
       " ('PCs,', [1]),\n",
       " ('Stolen?', [1]),\n",
       " ('\"Convictions', [1]),\n",
       " ('two.', [1]),\n",
       " ('\"It\\'s', [1, 1]),\n",
       " ('pub', [1]),\n",
       " ('lictor.acsu.buffalo.edu', [1]),\n",
       " ('mention', [1, 1]),\n",
       " ('be...', [1]),\n",
       " ('Celica', [4]),\n",
       " ('weak-encryption', [1]),\n",
       " ('Mellon,', [1]),\n",
       " ('feel', [1, 1]),\n",
       " ('they', [2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1]),\n",
       " ('IT...I', [1]),\n",
       " ('neat', [1]),\n",
       " ('till', [1, 1]),\n",
       " ('Air', [1]),\n",
       " (\"team's\", [1]),\n",
       " ('Script', [1]),\n",
       " ('Brewer', [1]),\n",
       " ('Hair', [1]),\n",
       " ('postseason,', [1]),\n",
       " ('tickets', [1]),\n",
       " ('VOICE', [1]),\n",
       " ('ordering', [1]),\n",
       " ('nodes', [1]),\n",
       " ('especially', [1]),\n",
       " ('X', [1]),\n",
       " ('industry', [1]),\n",
       " ('R', [1, 1]),\n",
       " ('/~~\\\\', [1]),\n",
       " ('Thanks,', [1, 1, 1]),\n",
       " ('email,', [1, 1]),\n",
       " ('Canopies', [1]),\n",
       " ('blow', [1]),\n",
       " ('Auto', [1]),\n",
       " ('\"A\"', [2]),\n",
       " ('GREAT!),', [1]),\n",
       " ('>argue', [1]),\n",
       " ('suspending', [1]),\n",
       " ('>sold', [1]),\n",
       " ('writing', [1]),\n",
       " ('120,', [1]),\n",
       " ('|/', [1]),\n",
       " ('1000yds.', [1]),\n",
       " ('61', [1]),\n",
       " ('bag!\"', [1]),\n",
       " ('1.1', [1, 1, 1]),\n",
       " ('escapes', [1]),\n",
       " ('lies.\"', [1]),\n",
       " ('Common', [1]),\n",
       " ('good?).', [1]),\n",
       " ('_|/_', [1]),\n",
       " ('>City,', [1]),\n",
       " ('Controlled', [1]),\n",
       " ('probably', [1, 1, 1, 1, 1, 1, 1, 1, 1]),\n",
       " ('tm0006.lerc.nasa.gov', [1]),\n",
       " ('interpretation', [2]),\n",
       " ('that.', [1]),\n",
       " ('parent.', [1]),\n",
       " ('war?', [1]),\n",
       " ('steam', [3]),\n",
       " ('16-Apr-93', [1]),\n",
       " ('Sharon).', [1]),\n",
       " ('implementation', [1]),\n",
       " ('thought', [1, 1]),\n",
       " ('System:', [1]),\n",
       " (\"we've\", [1]),\n",
       " ('tend', [1]),\n",
       " ('Talon,', [1]),\n",
       " ('lucky.', [1]),\n",
       " ('deduction)', [1]),\n",
       " ('Looking', [1, 1]),\n",
       " ('directly', [1]),\n",
       " ('shows', [1]),\n",
       " ('what', [2, 2, 4, 2, 3, 1, 2, 1, 1, 2, 2]),\n",
       " ('right,', [1]),\n",
       " ('Jesus', [1]),\n",
       " ('semi', [6]),\n",
       " ('destruction?', [1]),\n",
       " ('NL', [2, 1]),\n",
       " ('his', [3, 1, 1, 3]),\n",
       " ('Sad,', [1]),\n",
       " ('system', [1, 1, 1, 1, 1, 1]),\n",
       " ('hammer', [4]),\n",
       " ('{120%', [1]),\n",
       " ('sea.', [1]),\n",
       " ('r4938585@joplin.biosci.arizona.edu', [1]),\n",
       " ('/2', [1]),\n",
       " ('error', [1]),\n",
       " ('mail-bouncing', [1]),\n",
       " ('FL', [1]),\n",
       " ('comes.', [1]),\n",
       " ('8th', [2]),\n",
       " ('restrict', [1]),\n",
       " ('beyond', [1]),\n",
       " ('Enhancements', [1]),\n",
       " ('9', [1]),\n",
       " ('forsale', [1]),\n",
       " ('Main', [1]),\n",
       " ('see?', [1]),\n",
       " ('conclusion.', [1]),\n",
       " ('yesterday', [1]),\n",
       " ('runs', [1, 2]),\n",
       " ('cold', [2]),\n",
       " ('problem,', [1]),\n",
       " ('acquisition/control,', [1]),\n",
       " (\">Site's\", [1]),\n",
       " ('accessories', [1]),\n",
       " ('paying', [3]),\n",
       " ('MIT', [1]),\n",
       " ('write', [1, 1]),\n",
       " ('>You', [1]),\n",
       " ('(Charles', [1]),\n",
       " ('keeping', [1, 1, 1]),\n",
       " ('1991', [1, 1]),\n",
       " ('Nodine)', [1]),\n",
       " ('wrong,', [1]),\n",
       " ('male,', [1]),\n",
       " ('side', [1, 1]),\n",
       " ('po4.andrew.cmu.edu', [1]),\n",
       " ('adresses', [1]),\n",
       " ('Matic', [1]),\n",
       " ('chip', [4]),\n",
       " ('year,', [1, 3, 1]),\n",
       " ('seige,', [1]),\n",
       " ('fixed', [1]),\n",
       " ('uprising', [1]),\n",
       " ('924.', [1]),\n",
       " ('possible.', [1, 1, 1]),\n",
       " ('accepted,', [1]),\n",
       " ('count', [1]),\n",
       " ('products', [1]),\n",
       " ('cancelled.', [1]),\n",
       " ('is.', [1]),\n",
       " ('edit,', [1]),\n",
       " ('pick', [1, 1]),\n",
       " ('Home', [1]),\n",
       " ('external', [3]),\n",
       " ('0', [2]),\n",
       " ('obviously,', [1]),\n",
       " ('thinking', [1, 1, 1, 1]),\n",
       " ('unknowable.', [1]),\n",
       " ('Einstein=======', [1]),\n",
       " ('self', [2]),\n",
       " ('import', [1]),\n",
       " ('holmes7000@iscsvax.uni.edu', [1]),\n",
       " ('certainly', [1]),\n",
       " ('Times', [1]),\n",
       " ('fed', [1]),\n",
       " (\"'new\", [1]),\n",
       " ('Brady', [1]),\n",
       " ('(Adam', [1]),\n",
       " ('somwhere,', [1]),\n",
       " ('vs.', [1]),\n",
       " ('=======>', [1]),\n",
       " ('60', [1, 2]),\n",
       " ('DISCLAIMER:', [1]),\n",
       " ('facets', [1]),\n",
       " ('slc10.ins.cwru.edu', [1]),\n",
       " ('Kerr', [1]),\n",
       " ('1990', [1]),\n",
       " (\"Investors'\", [1]),\n",
       " ('containing', [1]),\n",
       " ('drugs,', [1, 1]),\n",
       " ('>federal', [1]),\n",
       " ('made', [2, 1]),\n",
       " ('>cover', [1]),\n",
       " ('Stack', [1]),\n",
       " ('tific', [1]),\n",
       " ('80Mb', [1]),\n",
       " ('Student', [1]),\n",
       " ('priorities.', [1]),\n",
       " ('10MB/s', [4]),\n",
       " ('Urbana', [1]),\n",
       " ('anti', [1]),\n",
       " ('24.', [1]),\n",
       " ('\"Although', [2]),\n",
       " ('>>20%', [1]),\n",
       " ('Viola', [3]),\n",
       " ('fell', [1]),\n",
       " ('tg@cs.toronto.edu', [1]),\n",
       " ('now,', [1]),\n",
       " ('36', [1, 1]),\n",
       " ('ask', [1, 1, 1]),\n",
       " (\"A's\", [1]),\n",
       " ('A,', [1]),\n",
       " ('DSO,', [1]),\n",
       " ('here,', [1]),\n",
       " ('criticizing', [1]),\n",
       " ('open', [1]),\n",
       " ('showing', [1]),\n",
       " ('too!)', [1]),\n",
       " ('time)', [1]),\n",
       " ('gets', [1, 1, 1, 1]),\n",
       " ('before,', [1]),\n",
       " ('William', [1]),\n",
       " ('honk', [1]),\n",
       " ('VTT', [1]),\n",
       " ('Permanent', [2]),\n",
       " ('power.', [2, 1]),\n",
       " ('\"God,', [1]),\n",
       " ('entity,', [1]),\n",
       " ('writers', [1, 1]),\n",
       " ('legitimate', [1]),\n",
       " ('motion,', [1]),\n",
       " ('that.\"', [1]),\n",
       " ('>...what', [1]),\n",
       " ('MORE', [1]),\n",
       " ('dual', [1]),\n",
       " ('owned', [1]),\n",
       " ('respective', [1]),\n",
       " (\"driver's\", [1]),\n",
       " ('(4/23)', [1]),\n",
       " ('DoD#', [1]),\n",
       " ('here?', [1]),\n",
       " ('SCSi-2', [1]),\n",
       " ('used', [3, 1, 1, 1, 1, 1]),\n",
       " ('outside', [1, 1]),\n",
       " ('(then', [1]),\n",
       " ('turning', [1]),\n",
       " ('0-5MB/s.', [1]),\n",
       " ('>water.', [1]),\n",
       " ('interest.', [1]),\n",
       " ('Askew)', [1]),\n",
       " ('maybe', [1, 1, 1, 1, 1]),\n",
       " ...]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(shuffle_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# NB Predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import numpy as np\n",
    "from collections import defaultdict\n",
    "from operator import itemgetter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Tokenizer: each match is a maximal run of word characters and apostrophes.\n",
    "word_search_re = re.compile(r\"[\\w']+\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def load_model(model_filename):\n",
    "    \"\"\"Read a word -> values table from a text file.\n",
    "\n",
    "    Every non-blank line is expected to hold two literals separated by\n",
    "    whitespace: a quoted word followed by a dict literal for that word.\n",
    "\n",
    "    Returns a defaultdict keyed by word. Looking up a word that was not\n",
    "    in the file creates and returns an empty defaultdict(float).\n",
    "    \"\"\"\n",
    "    from ast import literal_eval  # function-scope import keeps the cell self-contained\n",
    "\n",
    "    model = defaultdict(lambda: defaultdict(float))\n",
    "    with open(model_filename) as inf:\n",
    "        for line in inf:\n",
    "            if not line.strip():\n",
    "                continue  # a blank line would otherwise fail the two-way unpack\n",
    "            word, values = line.split(maxsplit=1)\n",
    "            # literal_eval only accepts literals, so a model file cannot run\n",
    "            # arbitrary code the way eval() would allow.\n",
    "            model[literal_eval(word)] = literal_eval(values)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# The model file is expected at ~/models/part-00000.\n",
    "model_filename = os.path.join(os.path.expanduser(\"~\"), \"models\", \"part-00000\")\n",
    "model = load_model(model_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(409.7987003114851, 513.3231594734408)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model[\"i\"][\"male\"], model[\"i\"][\"female\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def nb_predict(model, document):\n",
    "    \"\"\"Return whichever of \"male\" / \"female\" gets the higher summed log-value.\n",
    "\n",
    "    The document is tokenized with word_search_re and each distinct token is\n",
    "    counted once. A token with no stored value for a label contributes\n",
    "    log(1e-5) to that label.\n",
    "    \"\"\"\n",
    "    scores = defaultdict(int)\n",
    "    for token in set(word_search_re.findall(document)):\n",
    "        token_values = model[token]\n",
    "        for label in (\"male\", \"female\"):\n",
    "            scores[label] += np.log(token_values.get(label, 1e-5))\n",
    "    # Rank labels from highest to lowest score and report the winner.\n",
    "    ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)\n",
    "    return ranked[0][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "new_post = \"\"\" Every day should be a half day.  Took the afternoon off to hit the dentist, and while I was out I managed to get my oil changed, too.  Remember that business with my car dealership this winter?  Well, consider this the epilogue.  The friendly fellas at the Valvoline Instant Oil Change on Snelling were nice enough to notice that my dipstick was broken, and the metal piece was too far down in its little dipstick tube to pull out.  Looks like I'm going to need a magnet.   Damn you, Kline Nissan, daaaaaaammmnnn yooouuuu....   Today I let my boss know that I've submitted my Corps application.  The news has been greeted by everyone in the company with a level of enthusiasm that really floors me.     The back deck has finally been cleared off by the construction company working on the place.  This company, for anyone who's interested, consists mainly of one guy who spends his days cursing at his crew of Spanish-speaking laborers.  Construction of my deck began around the time Nixon was getting out of office.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'male'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nb_predict(model, new_post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "testing_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"blogposts_testing\")\n",
    "# os.listdir returns entries in arbitrary order; sort so every run visits the\n",
    "# test files in the same sequence.\n",
    "testing_filenames = [\n",
    "    os.path.join(testing_folder, filename)\n",
    "    for filename in sorted(os.listdir(testing_folder))\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def nb_predict_many(model, input_filename):\n",
    "    \"\"\"Yield (actual_gender, prediction) for every record in input_filename.\n",
    "\n",
    "    Each non-blank line is expected to hold two literals separated by\n",
    "    whitespace: the actual gender, then the blog post text. The prediction\n",
    "    is whatever nb_predict(model, blog_post) returns.\n",
    "    \"\"\"\n",
    "    from ast import literal_eval  # function-scope import keeps the cell self-contained\n",
    "\n",
    "    with open(input_filename) as inf:\n",
    "        for line in inf:\n",
    "            tokens = line.split()\n",
    "            if not tokens:\n",
    "                continue  # blank line: nothing to parse\n",
    "            # literal_eval only accepts literals, so a data file cannot run\n",
    "            # arbitrary code the way eval() would allow.\n",
    "            actual_gender = literal_eval(tokens[0])\n",
    "            # Re-joining on single spaces collapses whitespace runs inside the post.\n",
    "            blog_post = literal_eval(\" \".join(tokens[1:]))\n",
    "            yield actual_gender, nb_predict(model, blog_post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def nb_predict(model, document):\n",
    "    \"\"\"Return (label, score) pairs for \"male\" and \"female\", best score first.\n",
    "\n",
    "    Redefines the earlier nb_predict: scores start at 1, a missing value\n",
    "    contributes log(1e-15), and the whole ranking is returned instead of\n",
    "    only the top label. A document with no tokens yields an empty list.\n",
    "    \"\"\"\n",
    "    tokens = set(word_search_re.findall(document))\n",
    "    scores = defaultdict(lambda: 1)\n",
    "    for token in tokens:\n",
    "        token_values = model[token]\n",
    "        scores[\"male\"] += np.log(token_values.get(\"male\", 1e-15))\n",
    "        scores[\"female\"] += np.log(token_values.get(\"female\", 1e-15))\n",
    "    return sorted(scores.items(), key=itemgetter(1), reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Collect ground truth and predictions as booleans (True means \"female\").\n",
    "y_true = []\n",
    "y_pred = []\n",
    "for testing_filename in testing_filenames:\n",
    "    for actual_gender, ratios in nb_predict_many(model, testing_filename):\n",
    "        # ratios is the ranked list from nb_predict; its first entry is the top label.\n",
    "        predicted_gender = ratios[0][0]\n",
    "        y_true.append(actual_gender == \"female\")\n",
    "        y_pred.append(predicted_gender == \"female\")\n",
    "# Convert to 0/1 integer arrays.\n",
    "y_true = np.array(y_true, dtype='int')\n",
    "y_pred = np.array(y_pred, dtype='int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "f1=0.5540\n",
      "acc=0.5765\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score\n",
    "# Request the support-weighted average over both classes explicitly instead\n",
    "# of relying on f1_score's default averaging mode.\n",
    "print(\"f1={:.4f}\".format(f1_score(y_true, y_pred, pos_label=None, average=\"weighted\")))\n",
    "print(\"acc={:.4f}\".format(np.mean(y_true == y_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Second model file, expected at ~/models/model_aws; read with the same load_model.\n",
    "aws_model_filename = os.path.join(os.path.expanduser(\"~\"), \"models\", \"model_aws\")\n",
    "aws_model = load_model(aws_model_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Evaluate aws_model on the test files, stopping once more than 500 posts\n",
    "# have been scored.\n",
    "y_true = []\n",
    "y_pred = []\n",
    "for testing_filename in testing_filenames:\n",
    "    # Unpack the ranking for *this* post; reading a `ratios` left over from an\n",
    "    # earlier cell would give every post the same predicted label.\n",
    "    for actual_gender, ratios in nb_predict_many(aws_model, testing_filename):\n",
    "        predicted_gender = ratios[0][0]\n",
    "        y_true.append(actual_gender == \"female\")\n",
    "        y_pred.append(predicted_gender == \"female\")\n",
    "        if len(y_true) > 500:\n",
    "            break\n",
    "    # The inner break only leaves the current file, so check again here to\n",
    "    # stop visiting the remaining files.\n",
    "    if len(y_true) > 500:\n",
    "        break\n",
    "y_true = np.array(y_true, dtype='int')\n",
    "y_pred = np.array(y_pred, dtype='int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "f1=0.8144\n",
      "acc=0.8734\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.4/dist-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "# Request the support-weighted average over both classes explicitly instead\n",
    "# of relying on f1_score's default averaging mode.\n",
    "print(\"f1={:.4f}\".format(f1_score(y_true, y_pred, pos_label=None, average=\"weighted\")))\n",
    "print(\"acc={:.4f}\".format(np.mean(y_true == y_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]\n"
     ]
    }
   ],
   "source": [
    "print(list(zip(y_true, y_pred))[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[614,   0],\n",
       "       [ 89,   0]])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "confusion_matrix(y_true, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Test load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "filename = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"blogs\", \"1005545.male.25.Engineering.Sagittarius.xml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Collect the text between each <post> ... </post> pair, one string per post.\n",
    "all_posts = []\n",
    "inside_post = False\n",
    "current_lines = []\n",
    "with open(filename) as inf:\n",
    "    for raw_line in inf:\n",
    "        # Compare and store lines without their surrounding whitespace.\n",
    "        stripped = raw_line.strip()\n",
    "        if stripped == \"<post>\":\n",
    "            inside_post = True\n",
    "            continue\n",
    "        if stripped == \"</post>\":\n",
    "            all_posts.append(\"\\n\".join(current_lines))\n",
    "            current_lines = []\n",
    "            inside_post = False\n",
    "            continue\n",
    "        if inside_post:\n",
    "            current_lines.append(stripped)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "80"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(all_posts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "81px",
    "width": "253px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
